Assignment 03

Author
Affiliation

Devin Blanchard

Boston University

Published

September 24, 2025

Modified

October 17, 2025

1 Data Loading and Inspection

import pandas as pd
import plotly.express as px
import plotly.io as pio
from pyspark.sql import SparkSession
import re
import numpy as np
import plotly.graph_objects as go
from pyspark.sql.functions import col, split, explode, regexp_replace, transform, when
from pyspark.sql import functions as F
from pyspark.sql.functions import col, monotonically_increasing_id

np.random.seed(42)

pio.renderers.default = "notebook"

# Initialize Spark Session
spark = SparkSession.builder.appName("LightcastData").getOrCreate()

# Load Data
df = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("./data/lightcast_job_postings.csv")
df.createOrReplaceTempView("job_postings")

# Show Schema and Sample Data
#print("---This is Diagnostic check, No need to print it in the final doc---")

#df.printSchema() # comment this line when rendering the submission
#df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/17 13:20:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:>                                                          (0 + 1) / 1]                                                                                [Stage 1:>                                                          (0 + 1) / 1]                                                                                25/10/17 13:21:08 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

2 Data Cleaning

from pyspark.sql.functions import col

df = df.withColumn("SALARY", col("SALARY").cast("float"))
df = df.withColumn("SALARY_FROM", col("SALARY_FROM").cast("float"))
df = df.withColumn("SALARY_TO", col("SALARY_TO").cast("float"))
df = df.withColumn("MIN_YEARS_EXPERIENCE", col("MIN_YEARS_EXPERIENCE").cast("float"))
df = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))

# median_from, median_to, median_salary
median_from = df.approxQuantile("SALARY_FROM", [0.5], 0.01)[0]
median_to = df.approxQuantile("SALARY_TO", [0.5], 0.01)[0]
median_salary = df.approxQuantile("SALARY", [0.5], 0.01)[0]

print(median_from, median_to, median_salary)

# Impute missing 'SALARY_FROM' and 'SALARY_TO' with their medians
df = df.fillna({
    "SALARY_FROM": median_from,
    "SALARY_TO": median_to
})

# Compute 'AVERAGE_SALARY'
df = df.withColumn(
    "AVERAGE_SALARY",
    (col("SALARY_FROM") + col("SALARY_TO")) / 2
)

# Impute missing 'SALARY' with AVERAGE_SALARY, and if that's missing, with the median salary
from pyspark.sql.functions import when

df = df.withColumn(
    "SALARY",
    when(
        col("SALARY").isNull(),
        when(col("AVERAGE_SALARY").isNotNull(), col("AVERAGE_SALARY"))
        .otherwise(median_salary)
    ).otherwise(col("SALARY"))
)

from pyspark.sql.functions import regexp_replace

df = df.withColumn(
    "EDUCATION_LEVELS_NAME",
    regexp_replace(col("EDUCATION_LEVELS_NAME"), r'[\n\r]', '')
)

# Overwrite
df.write.option("header", True).mode("overwrite").csv("data/lightcast_job_postings_cleaned.csv")

# Display row count
print(f"Rows retained after cleaning: {df.count()}")
[Stage 2:>                                                          (0 + 1) / 1]                                                                                [Stage 3:>                                                          (0 + 1) / 1]                                                                                [Stage 4:>                                                          (0 + 1) / 1]                                                                                
87295.0 130042.0 115024.0
[Stage 5:>                                                          (0 + 1) / 1]                                                                                [Stage 6:>                                                          (0 + 1) / 1]
Rows retained after cleaning: 72498
                                                                                

3 Salary Distribution by Industry and Employment Type

# Filter for non-missing, nonzero salaries (use SALARY, not SALARY_FROM)

plot_df = df.select("NAICS2_NAME", "SALARY").filter(col("SALARY") > 0).toPandas()

fig = px.box(
    plot_df,
    x="NAICS2_NAME",
    y="SALARY",
    points="all",
    title="Salary Distribution by Industry",
)

fig.update_traces(
    marker=dict(color='rgb(52,152,219)', opacity=0.5, size=4),
    line=dict(color='rgb(41,128,185)'),
    fillcolor='rgba(41,128,185,0.3)',
    jitter=0,
    pointpos=0
)

fig.update_layout(
    xaxis_title="Industry",
    yaxis_title="SALARY",
    font=dict(size=14, family="Arial"),
    plot_bgcolor="#F4F8FF",
    paper_bgcolor="#F4F8FF",
    xaxis_tickangle=-90,
    height=1200,
    width=700,
    showlegend=False,
    boxmode='overlay'
)

fig.show()
[Stage 9:>                                                          (0 + 1) / 1]                                                                                

Salaries vary widely between industries, with sectors like Information and Finance & Insurance generally showing higher salary ranges than industries such as Accommodation and Food Services. Full-time positions tend to have higher median salaries across most industries compared to part-time or other employment types.